library(tidyverse)
library(janitor)
library(kableExtra)

Import Data

# Read the NHANES extract from disk and run it through janitor's clean_names()
# to standardise the column names. The rest of this file refers to columns by
# snake_case names such as phys_active_days and health_gen.
nhanes <- read_csv(file = "data/nhanes.csv") %>%
  clean_names()

select

# Keep only the age column; the result is printed, not saved.
nhanes %>%
  select(age)

Other ways we can select variables

Select multiple variables by listing them:

# Keep several columns by listing their bare names, separated by commas.
nhanes %>%
  select(height, weight)

Other ways we can select variables

You can use one_of() to do the same thing:

# Same selection as listing height and weight directly, but the column names
# are supplied as a character vector.
# NOTE(review): one_of() is superseded in current tidyselect; all_of() and
# any_of() are the suggested replacements — confirm before reusing this.
nhanes %>%
  select(one_of(c("height", "weight")))

Other ways we can select variables

contains() selects variables with certain text in the variable name:

# Keep every column whose name contains the text "age" (for example age and
# age_decade, both of which are used elsewhere in this file).
nhanes %>%
  select(contains("age"))

Other ways we can select variables

You can select a range of columns using the var1:var2 pattern

# Keep the run of adjacent columns that starts at weight and ends at bmi.
# Which columns fall in between depends on the column order of the data.
nhanes %>%
  select(weight:bmi)

Other ways we can select variables

Drop variables using the -var format:

# A leading minus drops the named column and keeps everything else.
nhanes %>%
  select(-id)

Other ways we can select variables

Drop a set of variables using the -(var1:var2) format:

# Drop a whole range of columns at once. The parentheses make the minus apply
# to the full id:education range rather than to id alone.
nhanes %>%
  select(-(id:education))

mutate

1. Create a new variable with a specific value

# Add a constant column: every row receives the same country value.
# The closing select() only narrows the printed output to the new column.
nhanes %>%
  mutate(country = "United States") %>%
  select(country)

2. Create a new variable based on other variables

# Derive height_inches from the existing height column by dividing by 2.54
# (assumes height is recorded in centimetres — TODO confirm), then show every
# column with "height" in its name so the old and new values sit side by side.
nhanes %>%
  mutate(height_inches = height / 2.54) %>%
  select(contains("height"))

3. Change an existing variable

# Reusing an existing column name in mutate() overwrites that column:
# here bmi is replaced by its value rounded to one decimal place.
nhanes %>%
  mutate(bmi = round(bmi, digits = 1)) %>%
  select(bmi)

filter

filter

# Keep only the rows where gender is exactly "female", then show that column.
nhanes %>%
  filter(gender == "female") %>%
  select(gender)

filter

# Keep the rows whose health_gen is anything other than "Good".
# filter() also drops rows where the comparison evaluates to NA, so rows with
# a missing health_gen do not appear in the result either.
nhanes %>%
  filter(health_gen != "Good") %>%
  select(health_gen)

filter

# Keep respondents whose general health is "Good", "VGood", or "Excellent".
# Each alternative is tested with == and the tests are combined with | (or).
# Combining != tests with | would be TRUE for every non-missing value, because
# no single value can equal all three labels at once, so nothing would be
# filtered out.
nhanes %>%
  filter(
    health_gen == "Good" |
      health_gen == "VGood" |
      health_gen == "Excellent"
  ) %>%
  select(health_gen)

filter

# %in% tests membership in a set of values in a single expression, which is
# shorter than writing one comparison per value.
nhanes %>%
  filter(health_gen %in% c("Good", "VGood", "Excellent")) %>%
  select(health_gen)

filter

You can chain together multiple filter functions. Doing it this way, we don’t have to create complex logic in one line.

# Two filter() calls in sequence: the second only sees the rows that passed
# the first, so a row must satisfy both conditions to be kept.
nhanes %>%
  filter(gender == "male") %>%
  filter(health_gen %in% c("Good", "VGood", "Excellent")) %>%
  select(gender, health_gen)

filter

# Numeric comparison: keep respondents older than 50 (all columns are kept).
nhanes %>%
  filter(age > 50)

filter

You can drop NAs with !is.na()

# Keep respondents over 50, then remove rows with a missing marital_status.
# is.na() flags missing values and the leading ! negates it.
nhanes %>%
  filter(age > 50) %>%
  filter(!is.na(marital_status)) %>%
  select(age, marital_status)

filter

You can also drop NAs with drop_na()

# Same steps as filtering with !is.na(), but drop_na() removes the rows that
# have a missing value in the named column.
nhanes %>%
  filter(age > 50) %>%
  drop_na(marital_status) %>%
  select(age, marital_status)

summarize

This doesn’t work! Notice what the result is.

# Deliberately leaves out na.rm to demonstrate the failure mode: mean()
# returns NA when the column contains missing values.
nhanes %>%
  summarize(mean_active_days = mean(phys_active_days))

summarize

Add na.rm = TRUE to make this work.

# na.rm = TRUE tells mean() to ignore missing values, so the summary is a
# number rather than NA.
nhanes %>%
  summarize(
    mean_active_days = mean(phys_active_days, na.rm = TRUE)
  )

summarize

You can have multiple arguments in each usage of summarize.

# Several summaries in one call; each named argument becomes an output column.
# n() counts rows and takes no column, so it includes the rows where
# phys_active_days is missing.
nhanes %>%
  summarize(
    mean_active_days = mean(phys_active_days, na.rm = TRUE),
    median_active_days = median(phys_active_days, na.rm = TRUE),
    number_of_responses = n()
  )

group_by

# Group by age_decade first so that summarize() computes one mean per decade
# instead of a single mean for the whole data set.
nhanes %>%
  group_by(age_decade) %>%
  summarize(
    mean_active_days = mean(phys_active_days, na.rm = TRUE)
  )

group_by example

group_by works for multiple groups.

# Grouping by two columns yields one summary row per age_decade / gender
# combination.
nhanes %>%
  group_by(age_decade, gender) %>%
  summarize(
    mean_active_days = mean(phys_active_days, na.rm = TRUE)
  )

count

If you just want to count the number of things per group, you can use count.

# Count the number of rows in each age_decade group.
nhanes %>%
  count(age_decade)

count

You can also count by multiple groups.

# Count the rows for every age_decade / gender combination.
nhanes %>%
  count(age_decade, gender)

arrange

arrange example

R arranges in ascending order by default

# Summarise mean active days per age_decade / gender, then sort the summary
# rows from the smallest mean to the largest.
nhanes %>%
  group_by(age_decade, gender) %>%
  summarize(
    mean_active_days = mean(phys_active_days, na.rm = TRUE)
  ) %>%
  arrange(mean_active_days)

arrange example

You can also arrange in descending order

# Same summary as the ascending example, but desc() reverses the sort so the
# largest mean comes first.
nhanes %>%
  group_by(age_decade, gender) %>%
  summarize(
    mean_active_days = mean(phys_active_days, na.rm = TRUE)
  ) %>%
  arrange(desc(mean_active_days))

Crosstabs

Sometimes you want your results in a crosstab. We’ll use the tabyl function in the janitor package to make crosstabs automatically.

# Two-way crosstab of counts: gender down the rows, age_decade across the
# columns.
nhanes %>%
  tabyl(gender, age_decade)

adorn_ functions

janitor has a set of functions that all start with adorn_ that add a number of things to our crosstabs. You call them after tabyl.

Add totals

# Append a Total row and a Total column to the gender x age_decade crosstab.
nhanes %>%
  tabyl(gender, age_decade) %>%
  adorn_totals(where = c("row", "col"))

Add percentages

# Add totals, then convert the counts to proportions.
# denominator = "row" is the default, spelled out here: each cell is divided
# by its row total.
nhanes %>%
  tabyl(gender, age_decade) %>%
  adorn_totals(where = c("row", "col")) %>%
  adorn_percentages(denominator = "row")

Format percentages

# Totals -> row proportions -> proportions rendered as percentage strings.
nhanes %>%
  tabyl(gender, age_decade) %>%
  adorn_totals(where = c("row", "col")) %>%
  adorn_percentages(denominator = "row") %>%
  adorn_pct_formatting()

Include n alongside percentages

# Totals -> row proportions -> formatted percentages, with adorn_ns() adding
# the underlying counts next to each percentage.
nhanes %>%
  tabyl(gender, age_decade) %>%
  adorn_totals(where = c("row", "col")) %>%
  adorn_percentages(denominator = "row") %>%
  adorn_pct_formatting() %>%
  adorn_ns()

Add title

# Full crosstab pipeline: totals, row percentages, formatting and counts,
# finished with adorn_title() so the table is labelled with its variable names.
nhanes %>%
  tabyl(gender, age_decade) %>%
  adorn_totals(where = c("row", "col")) %>%
  adorn_percentages(denominator = "row") %>%
  adorn_pct_formatting() %>%
  adorn_ns() %>%
  adorn_title()

Three (or more) way crosstabs

You can also do three (or more) way crosstabs automatically by adding more variables to the tabyl function.

# A third variable in tabyl() produces one gender x age_decade crosstab per
# education level. The adorn_* steps are applied to each of those tables, as
# the printed output below this chunk shows.
nhanes %>%
  tabyl(gender, age_decade, education) %>%
  adorn_totals(where = c("row", "col")) %>%
  adorn_percentages(denominator = "row") %>%
  adorn_pct_formatting() %>%
  adorn_ns() %>%
  adorn_title()
## $`8th Grade`
##         age_decade                                                    
##  gender        0-9    10-19     20-29      30-39      40-49      50-59
##  female   0.0% (0) 0.0% (0) 9.1% (19) 18.7% (39) 16.3% (34) 15.3% (32)
##    male   0.0% (0) 0.0% (0) 7.4% (18) 14.0% (34) 22.3% (54) 14.0% (34)
##   Total   0.0% (0) 0.0% (0) 8.2% (37) 16.2% (73) 19.5% (88) 14.6% (66)
##                                               
##       60-69        70+        NA_        Total
##  12.4% (26) 17.7% (37) 10.5% (22) 100.0% (209)
##  16.9% (41) 10.3% (25) 14.9% (36) 100.0% (242)
##  14.9% (67) 13.7% (62) 12.9% (58) 100.0% (451)
## 
## $`9 - 11th Grade`
##         age_decade                                             
##  gender        0-9    10-19       20-29       30-39       40-49
##  female   0.0% (0) 0.0% (0) 17.9%  (72) 15.2%  (61) 14.9%  (60)
##    male   0.0% (0) 0.0% (0) 20.6% (100) 16.7%  (81) 22.2% (108)
##   Total   0.0% (0) 0.0% (0) 19.4% (172) 16.0% (142) 18.9% (168)
##                                                          
##        50-59      60-69        70+       NA_        Total
##  18.7%  (75) 12.4% (50) 12.9% (52) 8.0% (32) 100.0% (402)
##  18.3%  (89)  9.7% (47)  9.1% (44) 3.5% (17) 100.0% (486)
##  18.5% (164) 10.9% (97) 10.8% (96) 5.5% (49) 100.0% (888)
## 
## $`College Grad`
##         age_decade                                             
##  gender        0-9    10-19       20-29       30-39       40-49
##  female   0.0% (0) 0.0% (0) 14.8% (163) 21.2% (233) 25.7% (282)
##    male   0.0% (0) 0.0% (0) 13.0% (130) 21.4% (214) 19.9% (199)
##   Total   0.0% (0) 0.0% (0) 14.0% (293) 21.3% (447) 22.9% (481)
##                                                            
##        50-59       60-69        70+       NA_         Total
##  19.7% (217) 10.9% (120) 5.2%  (57) 2.5% (27) 100.0% (1099)
##  20.0% (200) 16.0% (160) 6.4%  (64) 3.2% (32) 100.0%  (999)
##  19.9% (417) 13.3% (280) 5.8% (121) 2.8% (59) 100.0% (2098)
## 
## $`High School`
##         age_decade                                             
##  gender        0-9    10-19       20-29       30-39       40-49
##  female   0.0% (0) 0.0% (0) 20.3% (156) 13.6% (105) 17.5% (135)
##    male   0.0% (0) 0.0% (0) 21.0% (157) 15.7% (117) 22.5% (168)
##   Total   0.0% (0) 0.0% (0) 20.6% (313) 14.6% (222) 20.0% (303)
##                                                             
##        50-59       60-69         70+       NA_         Total
##  15.1% (116) 13.8% (106) 12.5%  (96) 7.3% (56) 100.0%  (770)
##  20.7% (155) 10.0%  (75)  5.9%  (44) 4.1% (31) 100.0%  (747)
##  17.9% (271) 11.9% (181)  9.2% (140) 5.7% (87) 100.0% (1517)
## 
## $`Some College`
##         age_decade                                             
##  gender        0-9    10-19       20-29       30-39       40-49
##  female   0.0% (0) 0.0% (0) 22.6% (271) 20.0% (239) 14.0% (167)
##    male   0.0% (0) 0.0% (0) 24.9% (266) 19.9% (213) 17.6% (188)
##   Total   0.0% (0) 0.0% (0) 23.7% (537) 19.9% (452) 15.7% (355)
##                                                            
##        50-59       60-69        70+       NA_         Total
##  15.3% (183) 14.8% (177) 8.7% (104) 4.7% (56) 100.0% (1197)
##  19.0% (203) 10.8% (116) 5.8%  (62) 2.1% (22) 100.0% (1070)
##  17.0% (386) 12.9% (293) 7.3% (166) 3.4% (78) 100.0% (2267)
## 
## $NA_
##           age_decade                                                 
##  gender          0-9        10-19    20-29    30-39    40-49    50-59
##  female 48.6%  (653) 50.9%  (684) 0.0% (0) 0.0% (0) 0.2% (3) 0.0% (0)
##    male 51.4%  (738) 48.1%  (690) 0.3% (4) 0.1% (2) 0.0% (0) 0.0% (0)
##   Total 50.1% (1391) 49.4% (1374) 0.1% (4) 0.1% (2) 0.1% (3) 0.0% (0)
##                                          
##     60-69      70+      NA_         Total
##  0.1% (1) 0.1% (2) 0.0% (0) 100.0% (1343)
##  0.0% (0) 0.0% (0) 0.1% (2) 100.0% (1436)
##  0.0% (1) 0.1% (2) 0.1% (2) 100.0% (2779)

Create new data frames

Sometimes you want to save the results of your work to a new data frame.

# Assign the per-decade summary to a name so it can be reused later instead of
# only being printed.
phys_activity_by_age <- nhanes %>%
  group_by(age_decade) %>%
  summarize(
    mean_active_days = mean(phys_active_days, na.rm = TRUE)
  ) %>%
  # With no columns named, drop_na() removes any summary row that still holds
  # a missing value (presumably the group whose age_decade is NA — verify).
  drop_na()

# Print the saved data frame.
phys_activity_by_age